# Load & format data
# Reads the haplotype table from the working directory; the raw import is
# kept untouched in CEACAM5_formated_data and all edits go to working_df.
CEACAM5_formated_data <- read.csv("CEACAM5_formated_data.csv")

# Only open the spreadsheet viewer when someone is there to look at it, so the
# script can also be run non-interactively (e.g. via Rscript).
if (interactive()) {
    View(CEACAM5_formated_data)
}

working_df <- CEACAM5_formated_data

# Cells that compare equal to "TRUE" are rewritten as "T".
# NOTE(review): presumably read.csv parsed columns containing only the letter
# "T" as logical TRUE -- confirm against the CSV.
working_df[working_df == "TRUE"] <- "T"

# Empty strings are treated as missing values.
working_df[working_df == ""] <- NA

library(ggplot2)

# Gets only the variant sites for each haplotype (row of working_df) and
# prints them as a vector; every unique vector is also stored once in a list.
# Column 52 is the last column for sites within the N-domain for CEACAM5, so
# only columns 1:52 are scanned. Within a row, every non-missing integer cell
# is collected as a variant site.
#
# Result: `haplotypes` holds one entry per unique site vector, each of the
# form list(<site vector>, <indices of the rows that share it>).

haplotypes <- list()
hap_strings <- NULL

for (z in seq_len(nrow(working_df))) {
    var_vect <- NULL
    for (i in working_df[z, 1:52]) {
        # `i` is the single cell of one column for row z
        if (is.integer(i) && !is.na(i)) {
            var_vect <- c(var_vect, i)
        }
    }

    # The key is joined with a separator so that different site vectors can
    # never collapse to the same string (e.g. c(15, 3) versus c(153)).
    hap_key <- paste(var_vect, collapse = ",")
    index <- match(hap_key, hap_strings)

    if (is.na(index)) {
        # first time this combination of sites is seen
        haplotypes[[length(haplotypes) + 1]] <- c(list(var_vect), z)
        hap_strings <- c(hap_strings, hap_key)
        print(var_vect)
    } else {
        # known combination: remember that row z carries it as well
        haplotypes[[index]][[2]] <- append(haplotypes[[index]][[2]], z)
    }
}

rm(hap_key)


# Combines counts and frequency values for identical N-domain haplotypes.
# Note: there are multiple identical N-domain haplotypes, because originally
# the haplotypes included variants outside of the N-domain as well.
#
# For every entry of `haplotypes`, the columns 58:ncol(working_df) of all rows
# belonging to it are summed. The result `hap_df` has one row per unique
# haplotype: a "var_sites" label (sites joined by ", ") followed by the summed
# columns, which keep their numeric type and their names from working_df.

freq_cols <- 58:ncol(working_df)

# label of each haplotype, e.g. "153, 159"
var_sites <- vapply(
    haplotypes,
    function(hap) paste(unlist(hap[1]), collapse = ", "),
    character(1)
)

# one column of totals per haplotype (rows = summed columns of working_df)
hap_totals <- vapply(
    haplotypes,
    function(hap) colSums(working_df[unlist(hap[2]), freq_cols]),
    numeric(length(freq_cols))
)

# Built directly as a data frame so the totals never pass through a character
# matrix (which would truncate them when converted back to numeric).
hap_df <- data.frame(
    var_sites = var_sites,
    t(hap_totals),
    stringsAsFactors = FALSE,
    check.names = FALSE
)

# Remove temporaries; only names that actually exist are removed so that no
# warning is raised for helpers that were never created.
rm(list = intersect(
    c("hap_strings", "i", "k", "index", "new_row", "total_freqs", "var_vect",
      "z", "freq_cols", "var_sites", "hap_totals"),
    ls()
))

# Get reference sequence data
# Total counts per population, stored as a one-row data frame (row name
# "total_pop_counts") whose columns are named after the population codes.

pop_names <- c("AFR", "AMR", "EAS", "EUR", "SAS", "GGVP.ALL")
total_pop_counts <- as.data.frame(
    matrix(
        c(1322, 694, 1008, 1006, 978, 1010),
        nrow = 1,
        dimnames = list("total_pop_counts", pop_names)
    )
)


# Builds the "REF" row: whatever frequency/count is not accounted for by the
# haplotypes in hap_df is attributed to the reference sequence.
# Overall: frequency = 1 - sum of haplotype frequencies,
#          count     = total count over all populations - sum of counts.
# The overall total is derived from total_pop_counts (6018) instead of being
# repeated as a literal.
ref_overall <- c(
    1 - sum(hap_df$Frequency),
    sum(unlist(total_pop_counts)) - sum(hap_df$count)
)

# Per population: the frequency column is named after the population and the
# matching count column is the one immediately to its right in hap_df.
ref_by_pop <- vapply(
    pop_names,
    function(pop) {
        count_local <- which(colnames(hap_df) == pop) + 1
        c(
            1 - sum(hap_df[[pop]]),
            total_pop_counts[[pop]] - sum(hap_df[[count_local]])
        )
    },
    numeric(2)
)

# as.vector() reads the 2 x n matrix column by column, i.e. frequency, count,
# frequency, count, ... in pop_names order -- the same layout hap_df uses.
# The values stay numeric throughout (no detour through character).
ref_data <- data.frame(
    "REF",
    t(c(ref_overall, as.vector(ref_by_pop))),
    stringsAsFactors = FALSE
)
colnames(ref_data) <- colnames(hap_df)

ref_and_haps <- rbind(ref_data, hap_df)

# Adds a new column named "group" that classifies every row of ref_and_haps:
#   "REF"   - the reference row (var_sites is "REF")
#   "major" - the haplotype contains at least one site of major_hap_sites
#   "minor" - everything else

# Identify haplotypes with SNPs which change human CEACAM5 sites to match
# sites in CEACAM1
major_hap_sites <- c(153, 159, 238, 248, 283, 299, 334, 338)

# var_sites stores the sites as text ("153, 159"), so compare text with text
# instead of relying on implicit number-to-string coercion.
major_site_labels <- as.character(major_hap_sites)

ref_and_haps$group <- vapply(
    strsplit(as.character(ref_and_haps$var_sites), ", "),
    function(sites) {
        if ("REF" %in% sites) {
            "REF"
        } else if (length(intersect(sites, major_site_labels)) > 0) {
            "major"
        } else {
            "minor"
        }
    },
    character(1)
)

# Minor alleles refers to haplotypes that do not contain any of the SNPs which
# change the reference CEACAM5 sequence to match CEACAM1.
# Combine minor alleles into a single category: one row labelled "Minor" in
# group "minor" holding the column sums of all minor haplotypes.

minor_df <- ref_and_haps[which(ref_and_haps$group == "minor"), ]

# every column between the first (var_sites) and the last (group) is summed
minor_totals <- colSums(minor_df[2:(ncol(minor_df) - 1)])

# Assembled as a data frame so the sums stay numeric instead of being turned
# into text and parsed back.
minor_sums <- data.frame(
    "Minor",
    t(minor_totals),
    "minor",
    stringsAsFactors = FALSE
)
colnames(minor_sums) <- colnames(ref_and_haps)

# Split major_df between variants: major haplotypes with a total count above
# the cutoff keep their own row, the remaining ones are summed into a single
# "LowFreqMajor" row (group "LowFreq").
major_df <- ref_and_haps[which(ref_and_haps$group == "major"), ]

low_freq_cutoff <- 20
is_low_freq <- major_df$count <= low_freq_cutoff

# Data frame (not a character matrix) so that rbind() below does not coerce
# the numeric columns of compressed_df to text.
lowFreqMajor <- data.frame(
    "LowFreqMajor",
    t(colSums(major_df[which(is_low_freq), 2:15])),
    "LowFreq",
    stringsAsFactors = FALSE
)
colnames(lowFreqMajor) <- colnames(ref_and_haps)

# Final table: reference row, high-frequency major haplotypes, the combined
# minor row and the combined low-frequency major row.
compressed_df <- rbind(
    ref_and_haps[which(ref_and_haps$group == "REF"), ],
    major_df[which(!is_low_freq), ],
    minor_sums,
    lowFreqMajor
)

# Defensive: guarantees numeric count/frequency columns even if one of the
# inputs above was supplied as text.
compressed_df[2:15] <- lapply(compressed_df[2:15], as.numeric)

# Code to make pie charts
# To plot an individual population, change `y = Frequency` in the line that
# creates `stacked_bar` to the code of the appropriate population.
# Population codes: AFR = Africa, EAS = East Asia, EUR = Europe,
# SAS = South Asia, GGVP.ALL = Gambia. AMR is also available in the data
# (presumably the Americas -- confirm).

# Colours are assigned by group name so the mapping does not depend on how the
# current locale sorts the labels.
# NOTE(review): assumes the original unnamed palette was meant for the order
# LowFreq, major, minor, REF -- confirm.
group_colours <- c(
    LowFreq = "#8dc4a2",
    major = "#429e64",
    minor = "gray",
    REF = "gold"
)

# A single stacked bar; turning the y axis into an angle makes it a pie.
stacked_bar <- ggplot(compressed_df, aes(x = "", y = Frequency, fill = group)) +
    geom_bar(
        width = 1,
        stat = "identity",
        color = "black",
        size = 2,
        show.legend = FALSE
    ) +
    scale_fill_manual(values = group_colours)

pie_chart <- stacked_bar + coord_polar("y", start = 0)

# Strip grid, axes and backgrounds so only the pie remains.
final_pie <- pie_chart +
    theme(
        panel.grid.major = element_blank(),
        panel.grid.minor = element_blank(),
        panel.background = element_rect(fill = "transparent", colour = NA),
        plot.background = element_rect(fill = "transparent", colour = NA),
        axis.text = element_blank(),
        axis.title = element_blank(),
        axis.ticks = element_blank()
    )
final_pie

# Save exactly this plot instead of relying on whatever plot was touched last.
ggsave("pie.pdf", plot = final_pie, width = 500, height = 500, units = "mm")